{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ade4bd3f-543b-460b-980f-0b41aab2c8b6",
   "metadata": {},
   "source": [
    "# Data Acquisition and Cleansing Code"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a360772e-7829-4c15-9af9-d4596efc7351",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use the %pip magic so the package is installed into the environment of\n",
    "# the running kernel rather than whichever `python` is first on the shell PATH.\n",
    "%pip install pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c98c7640-1472-4869-9fdd-f070d665ae1d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "\n",
    "def clean_currency(column):\n",
    "    \"\"\"Remove dollar signs and commas from a text column, then cast to Float64.\"\"\"\n",
    "    return column.replace(\"[$,]\", \"\", regex=True).astype(\"Float64\")\n",
    "\n",
    "\n",
    "james_bond_data = pd.read_csv(\"james_bond_data.csv\").convert_dtypes()\n",
    "\n",
    "# Map the raw CSV headers onto snake_case column names.\n",
    "new_column_names = {\n",
    "    \"Release\": \"release_date\",\n",
    "    \"Movie\": \"movie_title\",\n",
    "    \"Bond\": \"bond_actor\",\n",
    "    \"Bond_Car_MFG\": \"car_manufacturer\",\n",
    "    \"US_Gross\": \"income_usa\",\n",
    "    \"World_Gross\": \"income_world\",\n",
    "    \"Budget ($ 000s)\": \"movie_budget\",\n",
    "    \"Film_Length\": \"film_length\",\n",
    "    \"Avg_User_IMDB\": \"imdb\",\n",
    "    \"Avg_User_Rtn_Tom\": \"rotten_tomatoes\",\n",
    "    \"Martinis\": \"martinis_consumed\",\n",
    "    \"Kills_Bond\": \"bond_kills\",\n",
    "}\n",
    "\n",
    "# Build the cleansed frame in one chain so re-running the cell always starts\n",
    "# from the freshly loaded `james_bond_data`.\n",
    "data = (\n",
    "    james_bond_data.rename(columns=new_column_names)\n",
    "    # Supply fallback ratings for row label 10; combine_first only fills\n",
    "    # values that are missing in the calling frame.\n",
    "    .combine_first(\n",
    "        pd.DataFrame({\"imdb\": {10: 7.1}, \"rotten_tomatoes\": {10: 6.8}})\n",
    "    )\n",
    "    .assign(\n",
    "        income_usa=lambda df: clean_currency(df[\"income_usa\"]),\n",
    "        income_world=lambda df: clean_currency(df[\"income_world\"]),\n",
    "        # The source column \"Budget ($ 000s)\" is in thousands of dollars.\n",
    "        movie_budget=lambda df: clean_currency(df[\"movie_budget\"]) * 1000,\n",
    "        # NOTE(review): 1200 is presumably a data-entry error for 120.\n",
    "        film_length=lambda df: (\n",
    "            df[\"film_length\"]\n",
    "            .str.removesuffix(\"mins\")\n",
    "            .astype(\"Int64\")\n",
    "            .replace(1200, 120)\n",
    "        ),\n",
    "        release_date=lambda df: pd.to_datetime(\n",
    "            df[\"release_date\"], format=\"%B, %Y\"\n",
    "        ),\n",
    "        # Relies on release_date having been converted by the keyword above;\n",
    "        # assign() evaluates its keyword arguments in order.\n",
    "        release_year=lambda df: df[\"release_date\"].dt.year.astype(\"Int64\"),\n",
    "        bond_actor=lambda df: (\n",
    "            df[\"bond_actor\"]\n",
    "            .str.replace(\"Shawn\", \"Sean\")\n",
    "            .str.replace(\"MOORE\", \"Moore\")\n",
    "        ),\n",
    "        car_manufacturer=lambda df: df[\"car_manufacturer\"].str.replace(\n",
    "            \"Astin\", \"Aston\"\n",
    "        ),\n",
    "        # NOTE(review): -6 is presumably a sign error for 6.\n",
    "        martinis_consumed=lambda df: df[\"martinis_consumed\"].replace(-6, 6),\n",
    "    )\n",
    "    .drop_duplicates(ignore_index=True)\n",
    ")\n",
    "\n",
    "data.to_csv(\"james_bond_data_cleansed.csv\", index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f50918ee-e61f-46b2-b0c2-1ffa2c62bbc0",
   "metadata": {},
   "source": [
    "# Data Analysis Code"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "86817f68-05a0-4235-a1c8-a5d1f6e9141e",
   "metadata": {},
   "source": [
    "## Performing a Regression Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bee6d6cb-e418-4c1d-8b75-604b9ab2e63d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use the %pip magic so the packages are installed into the environment of\n",
    "# the running kernel rather than whichever `python` is first on the shell PATH.\n",
    "%pip install matplotlib scikit-learn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "669fb9d7-d744-4e6b-899e-a69aebec53ed",
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "from sklearn.linear_model import LinearRegression\n",
    "\n",
    "# scikit-learn expects a two-dimensional predictor, hence the list of columns.\n",
    "x = data[[\"imdb\"]]\n",
    "y = data[\"rotten_tomatoes\"]\n",
    "\n",
    "# fit() returns the estimator itself, so construction and fitting are chained.\n",
    "model = LinearRegression().fit(x, y)\n",
    "\n",
    "slope, intercept = model.coef_[0], model.intercept_\n",
    "r_squared = f\"R-Squared: {model.score(x, y):.2f}\"\n",
    "best_fit = f\"y = {slope:.4f}x{intercept:+.4f}\"\n",
    "y_pred = model.predict(x)\n",
    "\n",
    "fig, ax = plt.subplots()\n",
    "ax.scatter(x, y)\n",
    "ax.plot(x, y_pred, color=\"red\")\n",
    "# Annotate the chart with the fit statistics at fixed data coordinates.\n",
    "for y_position, label in ((5.5, r_squared), (7, best_fit)):\n",
    "    ax.text(7.25, y_position, label, fontsize=10)\n",
    "ax.set(title=\"Scatter Plot of Ratings\", xlabel=\"Average IMDb Rating\")\n",
    "ax.set_ylabel(\"Average Rotten Tomatoes Rating\")\n",
    "# fig.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b38df412-c320-49fb-93ae-e253405537a8",
   "metadata": {},
   "source": [
    "## Investigating a Statistical Distribution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "938e5942-e57f-4e41-99f1-215cfb37d0df",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Count the films falling into each of seven bins, then order the bins\n",
    "# by interval instead of by frequency.\n",
    "bin_counts = data[\"film_length\"].value_counts(bins=7)\n",
    "length = bin_counts.sort_index()\n",
    "\n",
    "fig, ax = plt.subplots()\n",
    "length.plot(\n",
    "    kind=\"bar\",\n",
    "    ax=ax,\n",
    "    title=\"Film Length Distribution\",\n",
    "    xlabel=\"Time Range (mins)\",\n",
    "    ylabel=\"Count\",\n",
    ")\n",
    "# fig.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ff4e9955-baf4-48eb-b032-fbf55f439194",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Summary statistics for the film length column.\n",
    "summary_stats = [\"min\", \"max\", \"mean\", \"std\"]\n",
    "data[\"film_length\"].agg(summary_stats)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1b14c433-c3a6-4484-bc0a-26825bd1e870",
   "metadata": {},
   "source": [
    "## Finding No Relationship"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2bb83374-347f-4cf6-bc21-8180a003371d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot the IMDb rating of each film against the number of kills by Bond.\n",
    "imdb_ratings = data[\"imdb\"]\n",
    "kill_counts = data[\"bond_kills\"]\n",
    "\n",
    "fig, ax = plt.subplots()\n",
    "ax.scatter(imdb_ratings, kill_counts)\n",
    "ax.set(title=\"Scatter Plot of Kills vs Ratings\", xlabel=\"Average IMDb Rating\")\n",
    "ax.set_ylabel(\"Kills by Bond\")\n",
    "# fig.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
